Understand where the user spends most of their time. This can mean the overall main locations across all times (say, over a week) or within a specific time frame (for example, a day). Over a week these would typically be home and the office, while at a daily resolution another cluster might show up on Tuesdays if the user, say, plays tennis then. The goal is deliberately open-ended, so feel free to come up with whatever you think makes the most sense!
The data comes from a research project and has the format [timestamp, latitude, longitude]. Feel free to use one file or as many as make sense for what you need/want to test. The data is from this research project: http://extrasensory.ucsd.edu/ or from Google Drive here: GDrive data
Please use any Python packages that you find useful. As mentioned, DBSCAN might be interesting: DBSCAN, DBSCAN Tutorial. But again, there are pros and cons to it. Use whatever you find useful.
The task is essentially unsupervised learning, specifically clustering of time series data, so algorithms designed for time series should work better since they take time into account. Let's first visualize the data with a heat map.
from os.path import join as pjoin
# from google.colab import drive
# mounted_drive_folder = '/content/data'
# data_path = 'ExtraSensory.per_uuid_absolute_location'
# drive.mount(mounted_drive_folder)
# !ls /content/data/My\ Drive/ExtraSensory.per_uuid_absolute_location | head
!ls data | head
import pandas as pd
# data_path = 'ExtraSensory.per_uuid_absolute_location'
# one_file_path = pjoin(
# mounted_drive_folder,
# 'My Drive',
# data_path,
# '00EABED2-271D-49D8-B599-1D4A09240601.absolute_locations.csv.gz'
# )
one_file_path = pjoin(
'data',
'00EABED2-271D-49D8-B599-1D4A09240601.absolute_locations.csv.gz'
)
# Quick peek at one user's location file
data = pd.read_csv(one_file_path, nrows=100, compression='gzip')
data.head(10)
import glob
# data_files_path_pattern = pjoin(
# mounted_drive_folder,
# 'My Drive',
# data_path,
# '*.csv.gz'
# )
data_files_path_pattern = pjoin(
'data',
'*.csv.gz'
)
locations_data = pd.concat(
pd.read_csv(
file_name,
compression='gzip',
on_bad_lines='skip'  # error_bad_lines was deprecated and later removed from pandas
) for file_name in glob.glob(data_files_path_pattern)
).sort_values(
'timestamp',
ascending=True
).astype(
{
'timestamp': 'int',
'latitude': 'float',
'longitude': 'float',
}
)
locations_data['timestamp'] = pd.to_datetime(
locations_data['timestamp'],
unit='s'
)
locations_data_ts = locations_data.set_index('timestamp', drop=True)
locations_data_ts.head()
!pip install gmaps
!pip install geojson
!pip install tslearn
!pip install shapely
!pip install geopy
!jupyter labextension install @jupyter-widgets/jupyterlab-manager
!jupyter labextension install @bokeh/jupyter_bokeh
!pip install google-api-python-client
!pip install googlemaps
# Some import snippets taken from previous Cracow air pollution research
import warnings
from datetime import datetime

import cufflinks as cf
import plotly.offline as offline

from bokeh.io import show, output_notebook, output_file, push_notebook
from bokeh.layouts import layout
from bokeh.models import ColumnDataSource, GMapOptions, HoverTool, CustomJS
from bokeh.models.widgets import CheckboxGroup, DateRangeSlider
from bokeh.palettes import Spectral11
from bokeh.plotting import figure, gmap

import gmaps
import gmaps.datasets
import geojson
from geojson import (
    Feature,
    Point,
    FeatureCollection,
    LineString,
    Polygon,
    MultiPolygon,
)

# Clustering
from sklearn.cluster import KMeans, DBSCAN
from tslearn.clustering import silhouette_score, TimeSeriesKMeans

import ipywidgets as widgets
from IPython.display import display
from ipywidgets.embed import embed_minimal_html

cf.go_offline()
offline.init_notebook_mode()
warnings.filterwarnings('ignore')
output_notebook()
GOOGLE_API_KEY = 'YOUR_GOOGLE_API_KEY'  # replace with your own key; never commit real keys
# Center the map on the user's median position
localize_latitude = locations_data_ts['latitude'].median()
localize_longitude = locations_data_ts['longitude'].median()
map_options = GMapOptions(
lat=localize_latitude,
lng=localize_longitude,
map_type='roadmap',
zoom=11
)
# Downsample to every 3rd point to keep the plot responsive
source = ColumnDataSource(
    data=locations_data_ts[::3]
)
p = gmap(
GOOGLE_API_KEY,
map_options,
title='Tracking of user positions',
plot_width=1200,
plot_height=1200
)
circles = p.circle(
x='longitude',
y='latitude',
size=8,
fill_color='blue',
fill_alpha=0.8,
source=source
)
hover = HoverTool(tooltips=[('index', '$index'),
                            ('(longitude,latitude)', '(@longitude, @latitude)')],
                  renderers=[circles])
p.add_tools(hover)
output_file('resampled_coordinates.html')
show(p)
# Count rows with missing coordinates vs. total rows
len(locations_data_ts[locations_data_ts.isnull().any(axis=1)]), len(locations_data_ts)
import numpy as np
# There are more complex interpolation methods which can be useful here
locations_data_ts_interp = locations_data_ts.interpolate(method='linear')
len(locations_data_ts_interp[locations_data_ts_interp.isnull().any(axis=1)])
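As the comment above hints, richer interpolation methods are available. Since the frame has a DatetimeIndex, one alternative is time-weighted interpolation; this is only a sketch, and the rest of the notebook keeps the linear version.

# Time-weighted interpolation respects irregular gaps between samples,
# unlike linear interpolation over row positions
locations_data_ts_time_interp = locations_data_ts.interpolate(method='time')
locations_data_ts_time_interp.isnull().any(axis=1).sum()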
gmaps.configure(api_key=GOOGLE_API_KEY)
locations = list(
    zip(
        locations_data_ts_interp['latitude'],
        locations_data_ts_interp['longitude']
    )
)
# Uniform weights: every observation contributes equally to the heat map
weights = [1] * len(locations)
fig = gmaps.figure(center=(localize_latitude, localize_longitude), zoom_level=10)
layer_of_tracker_map = gmaps.heatmap_layer(locations, weights=weights, dissipating=True)
layer_of_tracker_map.point_radius = 5
layer_of_tracker_map.max_intensity = 100
fig.add_layer(layer_of_tracker_map)
fig
embed_minimal_html('pure_heatmap.html', views=[fig])
Here we can see some red zones where the user spent most of the time, though the API doesn't provide a way to find cluster centers. Let's find these clusters with the DBSCAN algorithm (other options, such as the time series clustering algorithm TimeSeriesKMeans, could be used as well).
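To make the TimeSeriesKMeans alternative concrete, here is a sketch that clusters whole days of movement with DTW k-means; n_clusters=3, the 50-sample day length, and the >10-points filter are assumptions, not tuned values.

from tslearn.clustering import TimeSeriesKMeans

def resample_day(day_values, size=50):
    # Linearly resample one day's (n, 2) lat/lon track to a fixed length
    old_idx = np.linspace(0, 1, num=len(day_values))
    new_idx = np.linspace(0, 1, num=size)
    return np.column_stack([
        np.interp(new_idx, old_idx, day_values[:, dim]) for dim in range(2)
    ])

daily_tracks = np.stack([
    resample_day(group[['latitude', 'longitude']].to_numpy())
    for _, group in locations_data_ts_interp.groupby(locations_data_ts_interp.index.date)
    if len(group) > 10
])
day_labels_ts = TimeSeriesKMeans(n_clusters=3, metric='dtw', random_state=42).fit_predict(daily_tracks)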
Let's estimate the epsilon parameter as the maximum distance between points for them to be considered part of one cluster. Assuming a 0.5 km distance and kms_per_radian = 6371.0088, we can compute a value for epsilon in terms of latitude and longitude. We can raise min_samples from 1 to a larger value to treat isolated points as noise.
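As a quick sanity check of the unit conversion, here is a sketch; the two coordinates are illustrative, not taken from the dataset.

import numpy as np
from sklearn.metrics.pairwise import haversine_distances

kms_per_radian = 6371.0088
eps = 0.5 / kms_per_radian  # ~7.85e-05 radians for a 0.5 km neighbourhood

# Two illustrative points ~0.49 km apart (0.0044 degrees of latitude)
p1 = np.radians([[32.8801, -117.2340]])
p2 = np.radians([[32.8845, -117.2340]])
distance_in_radians = haversine_distances(p1, p2)[0, 0]
print(distance_in_radians * kms_per_radian)  # ~0.49 km
print(distance_in_radians <= eps)            # True: within the cluster distance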
We need to use the haversine metric and the ball tree algorithm to calculate great-circle distances between points. Epsilon, as well as the latitude and longitude coordinates, should be converted to radians.
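Before fitting DBSCAN, the same haversine/ball-tree setup can drive a k-distance check for epsilon: sort every point's distance to its k-th nearest neighbour and look for a knee in the curve. This is a sketch, assuming k=5 to match the min_samples used below and a coarse ::50 subsample to keep it cheap.

import numpy as np
from sklearn.neighbors import NearestNeighbors

# Distance to the 5th neighbour (the first "neighbour" is the point itself),
# converted back to km; a knee in the sorted curve suggests a reasonable eps
sample = np.radians(locations_data_ts_interp.iloc[::50].to_numpy())
nn = NearestNeighbors(n_neighbors=5, metric='haversine', algorithm='ball_tree').fit(sample)
kth_distances_km = np.sort(nn.kneighbors(sample)[0][:, -1]) * 6371.0088
pd.Series(kth_distances_km).plot(title='Distance to 5th neighbour (km)')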
As explained in https://stackoverflow.com/questions/44131411/dbscan-handling-big-data-crashes-and-memory-error and https://github.com/scikit-learn/scikit-learn/issues/5275, DBSCAN can easily run out of memory. To avoid that, I resample the original dataset here, taking e.g. every 10th point.
from sklearn.cluster import DBSCAN

np.random.seed(42)
# Scaling can be done with time windows
kms_per_radian = 6371.0088
eps = .5 / kms_per_radian
# coordinates_standardized = StandardScaler().fit_transform(locations_data_ts_interp)
resampling_rate = 10
coordinates = locations_data_ts_interp.iloc[::resampling_rate].to_numpy()
radian_coordinates = np.radians(coordinates)
# Haversine metric with a ball tree handles great-circle distances in radians
dbscan = DBSCAN(
    eps=eps,
    min_samples=5,
    metric='haversine',
    algorithm='ball_tree'
).fit(radian_coordinates)
labels = dbscan.labels_
# DBSCAN marks noise as -1; don't count it as a cluster
number_of_clusters = len(set(labels)) - (1 if -1 in labels else 0)
clusters = pd.Series([coordinates[labels == n] for n in range(number_of_clusters)])
print('Number of clusters: {}'.format(number_of_clusters))
from geopy.distance import great_circle
from shapely.geometry import MultiPoint

def find_centermost_point(cluster):
    # Return the cluster point closest to the cluster's geometric centroid
    if not len(cluster):
        return (None, None)
    centroid_point = MultiPoint(cluster).centroid
    centroid = (centroid_point.x, centroid_point.y)
    centermost_point = min(cluster, key=lambda point: great_circle(point, centroid).m)
    return tuple(centermost_point)

centermost_points = clusters.map(find_centermost_point)
# Points are (latitude, longitude) pairs, as expected by the gmaps layers
cluster_center_values = [(lat, lon) for lat, lon in centermost_points.values if lat is not None and lon is not None]
gmaps.configure(api_key=GOOGLE_API_KEY)
locations = list(
    zip(
        locations_data_ts_interp['latitude'],
        locations_data_ts_interp['longitude']
    )
)
weights = [1] * len(locations)
fig = gmaps.figure(center=(localize_latitude, localize_longitude), zoom_level=10)
layer_of_tracker_map = gmaps.heatmap_layer(locations, weights=weights, dissipating=True)
layer_of_tracker_map.point_radius = 5
layer_of_tracker_map.max_intensity = 100
fig.add_layer(layer_of_tracker_map)
# Mark the DBSCAN cluster centers on top of the heat map
symbols = gmaps.symbol_layer(
    cluster_center_values,
    stroke_color=['blue'] * len(cluster_center_values),
    fill_color=['blue'] * len(cluster_center_values)
)
fig.add_layer(symbols)
fig
embed_minimal_html('initial_clustering_attempt.html', views=[fig])
Another way to undersample is to keep only the samples where the user's (approximate) speed was low, i.e. where the user stayed in roughly the same place for a while. This can be done by searching for local minima, for example as follows:
from scipy.signal import argrelextrema

local_min_order = 2
# Crude speed proxy: change in distance-from-origin between consecutive
# samples, computed over the full interpolated track so that the local-minimum
# indices line up with the rows of locations_data_ts_interp
radian_track = np.radians(locations_data_ts_interp.to_numpy())
speed = np.diff(np.sqrt(radian_track[:, 0] ** 2 + radian_track[:, 1] ** 2))
min_speed_indices = argrelextrema(speed, np.less, order=local_min_order)[0]
eps = 0.1 / kms_per_radian
coordinates = locations_data_ts_interp.iloc[min_speed_indices].to_numpy()
radian_coordinates = np.radians(coordinates)
dbscan = DBSCAN(
    eps=eps,
    min_samples=5,
    metric='haversine',
    algorithm='ball_tree'
).fit(radian_coordinates)
labels = dbscan.labels_
number_of_clusters = len(set(labels)) - (1 if -1 in labels else 0)
clusters = pd.Series([coordinates[labels == n] for n in range(number_of_clusters)])
print('Number of clusters: {}'.format(number_of_clusters))
centermost_points = clusters.map(find_centermost_point)
cluster_center_values = [(lon, lat) for lon, lat in centermost_points.values if lon is not None and lat is not None]
# Compute support for every cluster center as the number of points in the cluster,
# and map it to a colour from the spectrum for visualization
support_values = [len(val) for val in clusters.values if len(val)]
max_val = max(support_values)
support_values_normalized = [float(i) / max_val for i in support_values]
# Map each normalized support onto one of the 11 Spectral colours,
# clamping the top value into the last bucket
mypalette = [
    Spectral11[min(int(support * len(Spectral11)), len(Spectral11) - 1)]
    for support in support_values_normalized
]
# Find cluster center location names
import googlemaps as gmaps_api_cli

gmaps_cli = gmaps_api_cli.Client(key=GOOGLE_API_KEY)
cluster_center_geocodes = [gmaps_cli.reverse_geocode(val) for val in cluster_center_values]
cluster_center_names = [geocode[0]['formatted_address'] for geocode in cluster_center_geocodes]
NUMBER_OF_PLACES_TO_TAKE = 4
# A `type` filter (e.g. type='restaurant') can narrow results to a specific business type
places_nearby_cluster_centers = [gmaps_cli.places_nearby(location=el, radius=50) for el in cluster_center_values]
places_nearby_cluster_centers = ['; '.join([el['name'] for el in place['results']][:NUMBER_OF_PLACES_TO_TAKE]) for place in places_nearby_cluster_centers]
hover_description_text = [f'{gcode_name}; {place_name}; Support: {support_val}'
                          for gcode_name, place_name, support_val in zip(cluster_center_names, places_nearby_cluster_centers, support_values)]
places_df = pd.DataFrame({
'place_names': places_nearby_cluster_centers,
'cluster_center_names': cluster_center_names,
'cluster_center_values': cluster_center_values,
'support_values': support_values
}).sort_values('support_values', ascending=False)
pd.set_option('display.max_colwidth', None)
places_df[places_df.support_values > 100]
pd.reset_option('display.max_colwidth')
Place names can also be processed to find the most common tokens and pin down the exact places the user visited. Different parameters of the model can be tuned, and metrics such as the silhouette score, the within-cluster sum of squares, and the elbow rule can be used to tweak the result toward an optimal number of clusters.
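A minimal sketch of both ideas, assuming the labels and radian_coordinates from the DBSCAN fit above and the places_df frame (sklearn's silhouette_score is aliased to avoid shadowing the tslearn import):

from collections import Counter
from sklearn.metrics import silhouette_score as sk_silhouette_score

# Silhouette of the DBSCAN result; noise points (label -1) are excluded and
# haversine keeps the score consistent with the clustering metric
mask = labels != -1
if len(set(labels[mask])) > 1:
    score = sk_silhouette_score(radian_coordinates[mask], labels[mask], metric='haversine')
    print('Silhouette score (noise excluded): {:.3f}'.format(score))

# Most common tokens across the nearby place names
token_counts = Counter(
    token
    for names in places_df['place_names']
    for token in names.replace(';', ' ').split()
)
token_counts.most_common(10)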
It's quite possible that the user buys books or works at "UC San Diego Bookstore";
works or uses services at 100-198 W Hawthorn St (Sage Payment Solutions);
rents an apartment (Solana Highlands Apartments, Sixth College Matthews Apartments);
and, since a medical center shows up, may be a medical college student, employee, or patient there.
gmaps.configure(api_key=GOOGLE_API_KEY)
locations = list(
    zip(
        locations_data_ts_interp['latitude'],
        locations_data_ts_interp['longitude']
    )
)
weights = [1] * len(locations)
fig = gmaps.figure(center=(localize_latitude, localize_longitude), zoom_level=10)
layer_of_tracker_map = gmaps.heatmap_layer(locations, weights=weights, dissipating=True)
layer_of_tracker_map.point_radius = 5
layer_of_tracker_map.max_intensity = 100
fig.add_layer(layer_of_tracker_map)
# Cluster centers coloured by support, with reverse-geocoded descriptions
symbols = gmaps.symbol_layer(
    cluster_center_values,
    stroke_color=mypalette,
    fill_color=mypalette,
    hover_text=hover_description_text,
    display_info_box=True,
    info_box_content=hover_description_text,
    scale=4
)
fig.add_layer(symbols)
fig
embed_minimal_html('clustering_result.html', views=[fig])
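Finally, returning to the weekly-vs-daily framing from the introduction, the same haversine DBSCAN can be run per weekday to surface recurring places (for example a Tuesday tennis court). A minimal sketch; the 0.5 km eps and min_samples=5 are carried over from above rather than tuned per day:

# Cluster each weekday separately to expose recurring weekly locations
for day in range(7):
    day_points = locations_data_ts_interp[locations_data_ts_interp.index.dayofweek == day]
    if len(day_points) < 5:
        continue
    day_labels = DBSCAN(
        eps=0.5 / kms_per_radian,
        min_samples=5,
        metric='haversine',
        algorithm='ball_tree'
    ).fit(np.radians(day_points.to_numpy())).labels_
    n_clusters = len(set(day_labels)) - (1 if -1 in day_labels else 0)
    print('Weekday {}: {} clusters from {} points'.format(day, n_clusters, len(day_points)))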
# Requires bokeh server
# min_date = locations_data_ts.index.min()
# max_date = locations_data_ts.index.max()
# date_range_slider = DateRangeSlider(
# title='Date Range: ',
# start=min_date,
# end=max_date,
# value=(
# min_date,
# max_date
# ),
# step=1
# )
# def plot_sensor_time_series(locations_data_ts_filtered):
# map_options = GMapOptions(
# lat=localize_latitude,
# lng=localize_longitude,
# map_type='roadmap',
# zoom=11
# )
# source = ColumnDataSource(
# data=locations_data_ts_filtered
# )
# p = gmap(
# GOOGLE_API_KEY,
# map_options,
# title='Tracking of user positions',
# plot_width=1200,
# plot_height=1200
# )
# circles = p.circle(
# x='longitude',
# y='latitude',
# size=8,
# fill_color='blue',
# fill_alpha=0.8,
# source=source
# )
# hover = HoverTool(tooltips=[('index', '$index'),
# ('(longitude,latitude)', '(@longitude, @latitude)'),
# ('sensor id', '@id')], renderers=[circles])
# p.add_tools(hover)
# show(p)
# def date_range_change_handler(attr, old, new):
# start_date, end_date = new
# df_to_plot = locations_data_ts.loc[start_date:end_date, ['latitude', 'longitude']]
# plot_sensor_time_series(df_to_plot)
# date_range_slider.on_change('value', date_range_change_handler)
# l = layout(children=[[date_range_slider]], sizing_mode='scale_width')
# handle = show(l, notebook_handle=True)
# push_notebook(handle=handle)
# All installed pip dependencies
!pip list